/** * Copyright 2009 T Jake Luciani * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package lucandra; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.util.Arrays; import java.util.Collection; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.StringTokenizer; import java.util.UUID; import java.util.concurrent.atomic.AtomicInteger; import org.apache.cassandra.thrift.Cassandra; import org.apache.cassandra.thrift.Column; import org.apache.cassandra.thrift.ColumnOrSuperColumn; import org.apache.cassandra.thrift.ColumnParent; import org.apache.cassandra.thrift.ConsistencyLevel; import org.apache.cassandra.thrift.SlicePredicate; import org.apache.cassandra.thrift.SliceRange; import org.apache.cassandra.thrift.SuperColumn; import org.apache.log4j.Logger; import org.apache.lucene.analysis.SimpleAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldSelector; import org.apache.lucene.document.Field.Index; import org.apache.lucene.document.Field.Store; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermDocs; import org.apache.lucene.index.TermEnum; import org.apache.lucene.index.TermFreqVector; import org.apache.lucene.index.TermPositions; import org.apache.lucene.index.TermVectorMapper; import org.apache.lucene.index.IndexWriter.MaxFieldLength; import org.apache.lucene.search.Similarity; import org.apache.lucene.store.Directory; import org.apache.lucene.store.LockObtainFailedException; import org.apache.lucene.store.RAMDirectory; import solandra.SolandraFieldSelector; public class IndexReader extends org.apache.lucene.index.IndexReader { private final static int numDocs = 1000000; private final static Directory mockDirectory = new RAMDirectory(); static { try { new IndexWriter(mockDirectory, new SimpleAnalyzer(), true, MaxFieldLength.LIMITED); } catch (CorruptIndexException e) { throw new RuntimeException(e); } catch (LockObtainFailedException e) { throw new RuntimeException(e); } catch (IOException e) { throw new RuntimeException(e); } } private final String indexName; private final Cassandra.Iface client; private final ThreadLocal<Map<String, Integer>> docIdToDocIndex = new ThreadLocal<Map<String, Integer>>(); private final ThreadLocal<Map<Integer, String>> docIndexToDocId = new ThreadLocal<Map<Integer, String>>(); private final ThreadLocal<Map<Integer, Document>> documentCache = new ThreadLocal<Map<Integer, Document>>(); private final ThreadLocal<AtomicInteger> docCounter = new ThreadLocal<AtomicInteger>(); private final ThreadLocal<Map<Term, LucandraTermEnum>> termEnumCache = new ThreadLocal<Map<Term, LucandraTermEnum>>(); private final ThreadLocal<Map<String,byte[]>> fieldNorms = new ThreadLocal<Map<String, byte[]>>(); private final static ThreadLocal<Object> fieldCacheRefs = new ThreadLocal<Object>(); private static final Logger logger = Logger.getLogger(IndexReader.class); public IndexReader(String name, Cassandra.Iface client) { super(); this.indexName = name; this.client = client; } public synchronized IndexReader reopen() throws CorruptIndexException, IOException { clearCache(); return this; } @Override public Object getFieldCacheKey() { Object ref = fieldCacheRefs.get(); if(ref == null){ ref = UUID.randomUUID(); fieldCacheRefs.set(ref); } return ref; } public void clearCache() { if(docCounter.get() != null) docCounter.get().set(0); if(docIdToDocIndex.get() != null) docIdToDocIndex.get().clear(); if(docIndexToDocId.get() != null) docIndexToDocId.get().clear(); if(termEnumCache.get() != null) termEnumCache.get().clear(); if(documentCache.get() != null) documentCache.get().clear(); if(fieldNorms.get() != null) fieldNorms.get().clear(); if (fieldCacheRefs.get() != null) fieldCacheRefs.set(UUID.randomUUID()); } protected void doClose() throws IOException { clearCache(); } protected void doCommit() throws IOException { clearCache(); } protected void doDelete(int arg0) throws CorruptIndexException, IOException { } protected void doSetNorm(int arg0, String arg1, byte arg2) throws CorruptIndexException, IOException { } protected void doUndeleteAll() throws CorruptIndexException, IOException { } public int docFreq(Term term) throws IOException { LucandraTermEnum termEnum = getTermEnumCache().get(term); if (termEnum == null) { long start = System.currentTimeMillis(); termEnum = new LucandraTermEnum(this); termEnum.skipTo(term); long end = System.currentTimeMillis(); logger.debug("docFreq() took: " + (end - start) + "ms"); getTermEnumCache().put(term, termEnum); } return termEnum.docFreq(); } public Document document(int docNum, FieldSelector selector) throws CorruptIndexException, IOException { Document doc = getDocumentCache().get(docNum); if (doc != null){ logger.debug("Found doc in cache"); return doc; } String docId = getDocIndexToDocId().get(docNum); if (docId == null) return null; Map<Integer, String> keyMap = new HashMap<Integer, String>(); keyMap.put(docNum, CassandraUtils.hashKey(indexName + CassandraUtils.delimeter + docId)); List<byte[]> fieldNames = null; // Special field selector used to carry list of other docIds to cache in // Parallel for Solr Performance if (selector != null && selector instanceof SolandraFieldSelector) { List<Integer> otherDocIds = ((SolandraFieldSelector) selector).getOtherDocsToCache(); fieldNames = ((SolandraFieldSelector) selector).getFieldNames(); logger.debug("Going to bulk load "+otherDocIds.size()+" documents"); for (Integer otherDocNum : otherDocIds) { if (otherDocNum == docNum) continue; if (getDocumentCache().containsKey(otherDocNum)) continue; String docKey = getDocIndexToDocId().get(otherDocNum); if (docKey == null) continue; keyMap.put(otherDocNum, CassandraUtils.hashKey(indexName + CassandraUtils.delimeter + docKey)); } } ColumnParent columnParent = new ColumnParent(); columnParent.setColumn_family(CassandraUtils.docColumnFamily); SlicePredicate slicePredicate = new SlicePredicate(); if (fieldNames == null || fieldNames.size() == 0) { // get all columns ( except this skips meta info ) slicePredicate.setSlice_range(new SliceRange(new byte[] {}, CassandraUtils.finalToken.getBytes("UTF-8"), false, 100)); } else { slicePredicate.setColumn_names(fieldNames); } long start = System.currentTimeMillis(); try { Map<String, List<ColumnOrSuperColumn>> docMap = client.multiget_slice(CassandraUtils.keySpace, Arrays.asList(keyMap.values().toArray( new String[] {})), columnParent, slicePredicate, ConsistencyLevel.ONE); for (Map.Entry<Integer, String> key : keyMap.entrySet()) { List<ColumnOrSuperColumn> cols = docMap.get(key.getValue()); if (cols == null) { logger.warn("Missing document in multiget_slice for: " + key.getValue()); continue; } Document cacheDoc = new Document(); for (ColumnOrSuperColumn col : cols) { Field field = null; String fieldName = new String(col.column.name); //Incase __META__ slips through if(Arrays.equals(col.column.name,CassandraUtils.documentMetaField.getBytes())){ logger.debug("Filtering out __META__ key"); continue; } byte[] value; if (col.column.value[col.column.value.length - 1] != Byte.MAX_VALUE && col.column.value[col.column.value.length - 1] != Byte.MIN_VALUE) { throw new CorruptIndexException("Lucandra field is not properly encoded: "+docId+"("+fieldName+")"); } else if (col.column.value[col.column.value.length - 1] == Byte.MAX_VALUE) { //Binary value = new byte[col.column.value.length - 1]; System.arraycopy(col.column.value, 0, value, 0, col.column.value.length - 1); field = new Field(fieldName, value, Store.YES); cacheDoc.add(field); } else if (col.column.value[col.column.value.length - 1] == Byte.MIN_VALUE) { //String value = new byte[col.column.value.length - 1]; System.arraycopy(col.column.value, 0, value, 0, col.column.value.length - 1); //Check for multi-fields String fieldString = new String(value,"UTF-8"); if(fieldString.indexOf(CassandraUtils.delimeter) >= 0 ){ StringTokenizer tok = new StringTokenizer(fieldString,CassandraUtils.delimeter); while(tok.hasMoreTokens()) { field = new Field(fieldName, tok.nextToken(), Store.YES, Index.ANALYZED); cacheDoc.add(field); } }else{ field = new Field(fieldName, fieldString, Store.YES, Index.ANALYZED); cacheDoc.add(field); } } } //Mark the required doc if(key.getKey().equals(docNum)) doc = cacheDoc; getDocumentCache().put(key.getKey(),cacheDoc); } long end = System.currentTimeMillis(); logger.debug("Document read took: " + (end - start) + "ms"); return doc; } catch (Exception e) { throw new IOException(e.getLocalizedMessage()); } } @Override public Collection getFieldNames(FieldOption fieldOption) { return Arrays.asList(new String[] {}); } @Override public TermFreqVector getTermFreqVector(int docNum, String field) throws IOException { String docId = getDocIndexToDocId().get(docNum); TermFreqVector termVector = new lucandra.TermFreqVector(indexName, field, docId, client); return termVector; } @Override public void getTermFreqVector(int arg0, TermVectorMapper arg1) throws IOException { throw new RuntimeException(); } @Override public void getTermFreqVector(int arg0, String arg1, TermVectorMapper arg2) throws IOException { throw new RuntimeException(); } @Override public TermFreqVector[] getTermFreqVectors(int arg0) throws IOException { throw new RuntimeException(); } @Override public boolean hasDeletions() { return false; } @Override public boolean isDeleted(int arg0) { return false; } @Override public int maxDoc() { // if (numDocs == null) // numDocs(); return numDocs + 1; } @Override public byte[] norms(String field) throws IOException { return getFieldNorms().get(field); } @Override public void norms(String arg0, byte[] arg1, int arg2) throws IOException { throw new RuntimeException("This operation is not supported"); } @Override public int numDocs() { return numDocs; } @Override public TermDocs termDocs() throws IOException { return new LucandraTermDocs(this); } @Override public TermPositions termPositions() throws IOException { return new LucandraTermDocs(this); } @Override public TermEnum terms() throws IOException { return new LucandraTermEnum(this); } @Override public TermEnum terms(Term term) throws IOException { LucandraTermEnum termEnum = getTermEnumCache().get(term); if(termEnum == null) termEnum = new LucandraTermEnum(this); if( !termEnum.skipTo(term) ) //if found in the cache then reset, otherwise init. termEnum = null; return termEnum; } public int addDocument(SuperColumn docInfo, String field) { String id; try { id = new String(docInfo.name, "UTF-8"); } catch (UnsupportedEncodingException e) { throw new IllegalStateException("Cant make docId a string"); } Integer idx = getDocIdToDocIndex().get(id); if (idx == null) { idx = getDocCounter().incrementAndGet(); if (idx > numDocs) throw new IllegalStateException("numDocs reached"); getDocIdToDocIndex().put(id, idx); getDocIndexToDocId().put(idx, id); Byte norm = null; for(Column c : docInfo.columns){ if(Arrays.equals(c.name, CassandraUtils.normsKey.getBytes())){ if(c.value.length != 1) throw new IllegalStateException("Norm for field "+field+" must be a single byte"); norm = c.value[0]; } } if(norm == null) norm = Similarity.encodeNorm(1.0f); byte[] norms = getFieldNorms().get(field); if (norms == null) norms = new byte[1024]; while(norms.length <= idx && norms.length < numDocs ){ byte[] _norms = new byte[(norms.length * 2) < numDocs ? (norms.length * 2) : (numDocs + 1)]; System.arraycopy(norms, 0, _norms, 0, norms.length); norms = _norms; } // find next empty position norms[idx] = norm; getFieldNorms().put(field, norms); } return idx; } public int getDocumentNumber(byte[] docId){ String id; try { id = new String(docId, "UTF-8"); } catch (UnsupportedEncodingException e) { throw new IllegalStateException("Cant make docId a string"); } return getDocIdToDocIndex().get(id); } public String getDocumentId(int docNum) { return getDocIndexToDocId().get(docNum); } public String getIndexName() { return indexName; } public Cassandra.Iface getClient() { return client; } public LucandraTermEnum checkTermCache(Term term) { return getTermEnumCache().get(term); } public void addTermEnumCache(Term term, LucandraTermEnum termEnum) { getTermEnumCache().put(term, termEnum); } @Override public Directory directory() { clearCache(); return mockDirectory; } @Override public long getVersion() { return 1; } @Override public boolean isOptimized() { return true; } @Override public boolean isCurrent() { return true; } public Map<Integer, String> getDocIndexToDocId() { Map<Integer, String> c = docIndexToDocId.get(); if(c == null){ c = new HashMap<Integer,String>(); docIndexToDocId.set(c); } return c; } private Map<String,Integer> getDocIdToDocIndex(){ Map<String, Integer> c = docIdToDocIndex.get(); if(c == null){ c = new HashMap<String,Integer>(); docIdToDocIndex.set(c); } return c; } private AtomicInteger getDocCounter(){ AtomicInteger c = docCounter.get(); if(c == null){ c = new AtomicInteger(0); docCounter.set(c); } return c; } private Map<Term,LucandraTermEnum> getTermEnumCache(){ Map<Term,LucandraTermEnum> c = termEnumCache.get(); if(c == null){ c = new HashMap<Term,LucandraTermEnum>(); termEnumCache.set(c); } return c; } private Map<Integer,Document> getDocumentCache(){ Map<Integer,Document> c = documentCache.get(); if(c == null){ c = new HashMap<Integer,Document>(); documentCache.set(c); } return c; } private Map<String,byte[]> getFieldNorms(){ Map<String, byte[]> c = fieldNorms.get(); if(c == null){ c = new HashMap<String,byte[]>(); fieldNorms.set(c); } return c; } }